# 登录古诗文网站
import os
from urllib.parse import urljoin

import requests
from lxml import etree

'''
分析:
1.需要通过登录页面获取到__VIEWSTATE,__VIEWSTATEGENERATOR 值
2.图片验证码,这里使用人工输入的方式
'''

# Site root; the captcha <img> src is resolved against it.
base_url = 'https://www.gushiwen.cn/'
# Login page (GET): carries the hidden __VIEWSTATE / __VIEWSTATEGENERATOR fields.
login_page_url = 'https://www.gushiwen.cn/user/login.aspx?from=https%3A%2F%2Fwww.gushiwen.cn%2F'
# Login form target (POST).
login_url = 'https://www.gushiwen.cn/user/login.aspx?from=https%3a%2f%2fwww.gushiwen.cn%2f'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# requests applies no timeout by default, so a stalled server would hang the
# script forever; bound every request (seconds).
request_timeout = 10


def _first_match(tree, xpath, what):
    """Return the first result of *xpath* on *tree*.

    Raises IndexError (the same type a bare ``[0]`` would raise) with a
    message naming *what* was missing, so a changed page layout is easy to
    diagnose.
    """
    matches = tree.xpath(xpath)
    if not matches:
        raise IndexError(f'{what} not found on the login page (xpath: {xpath})')
    return matches[0]


# One session for every request: the login page, the captcha image and the
# login POST then share cookies and the same User-Agent. The original author
# noted that a captcha fetched outside the session (e.g. with
# urllib.request.urlretrieve) is not valid for the login request.
with requests.Session() as session:
    session.headers.update(headers)

    # Step 1: fetch the login page and pull out the hidden form fields.
    response = session.get(url=login_page_url, timeout=request_timeout)
    response.raise_for_status()  # do not parse an HTTP error page
    login_page = response.text
    tree = etree.HTML(login_page)
    v1 = _first_match(tree, '//*[@id="__VIEWSTATEGENERATOR"]/@value', '__VIEWSTATEGENERATOR')
    print(v1)
    v2 = _first_match(tree, '//*[@id="__VIEWSTATE"]/@value', '__VIEWSTATE')
    print(v2)

    # Step 2: download the captcha image so a human can read it.
    v3 = _first_match(tree, '//*[@id="imgCode"]/@src', 'captcha image')
    # urljoin copes with a leading "/" or an absolute src, where plain string
    # concatenation would produce "//" or a broken URL.
    v3_url = urljoin(base_url, v3)
    print(v3_url)

    response_code = session.get(v3_url, timeout=request_timeout)
    response_code.raise_for_status()  # do not save an error page as code.jpg
    content_code = response_code.content
    # 'wb' writes the raw image bytes unchanged.
    with open('code.jpg', 'wb') as fp:
        fp.write(content_code)

    # Step 3: the captcha is solved manually, then the form is submitted.
    code = input("请输入验证码:")
    data = {
        '__VIEWSTATE': v2,
        '__VIEWSTATEGENERATOR': v1,
        'from': 'https://www.gushiwen.cn/',
        # NOTE(review): credentials are kept in source as defaults for
        # backward compatibility; prefer supplying them via the
        # GUSHIWEN_EMAIL / GUSHIWEN_PWD environment variables.
        'email': os.environ.get('GUSHIWEN_EMAIL', '18380278693'),
        'pwd': os.environ.get('GUSHIWEN_PWD', 'gsww200913'),
        'code': code,
        'denglu': '登录'
    }

    response = session.post(url=login_url, data=data, timeout=request_timeout)
    # No status check here: the response page is saved as-is for inspection,
    # whether or not the login succeeded.
    content_html = response.text

    with open('088_gushiwen.html', 'w', encoding='utf-8') as fp:
        fp.write(content_html)
